knitr::opts_chunk$set(echo = TRUE, 
                      warning = FALSE, 
                      message = FALSE, 
                      fig.align = 'center')
knitr::opts_knit$set(root.dir = "/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex")
knitr::opts_knit$set(base.dir = "/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex")
library(caret)
library(pbapply)
library(parallel)
library(multiROC)
library(doParallel)
library(data.table)
library(ggplot2)
library(ggpubr)
library(doParallel)
cl <- makePSOCKcluster(20)
registerDoParallel(cl)
SigTab <- readRDS("./analysis/07.VirusClassification/03.Merge/00.BarcodeSignal/BarcodeSignal.Rds")
read2gene <- readRDS("./analysis/07.VirusClassification/03.Merge/00.BarcodeSignal/Reads2Barcode.Rds")
setkey(read2gene, read)
read2gene[Barcode %in% c("RTA-12", "RTA-13", "RTA-14", "RTA-15", "RTA-16", "RTA-17", "RTA-00", "RTA-01", "RTA-02", "RTA-04"), BarcodeLength := "L20"]
read2gene[Barcode %in% c("RTA-24", "RTA-25", "RTA-26", "RTA-28", "RTA-29", "RTA-27", "RTA-30", "RTA-31", "RTA-32"), BarcodeLength := "L22"]
read2gene[Barcode %in% c("RTA-03", "RTA-08", "RTA-10", "RTA-11", "RTA-18", "RTA-19", "RTA-20", "RTA-21", "RTA-22", "RTA-23"), BarcodeLength := "L24"]
read2gene[Barcode %in% c("RTA-33", "RTA-37", "RTA-34", "RTA-35", "RTA-36", "RTA-38", "RTA-39", "RTA-40", "RTA-41"), BarcodeLength := "L26"]
read2gene[Barcode %in% c("RTA-42", "RTA-43", "RTA-44", "RTA-45", "RTA-46", "RTA-47", "RTA-05", "RTA-06", "RTA-07", "RTA-09"), BarcodeLength := "L28"]

BarocdeN <- read2gene[, .N, c("Barcode", "BarcodeLength")][order(N, decreasing = T)]
my_compa <- list(c("L20", "L22"), c("L22", "L24"), c("L20", "L24"), c("L24", "L26"), c("L26", "L28"))
ggplot(BarocdeN, aes(x = BarcodeLength, y = N)) + 
  geom_boxplot() + 
  geom_jitter(height = 0) +
  theme_bw(base_size = 15) + 
  stat_compare_means(comparisons = my_compa, method = "t.test")
ggplot(BarocdeN, aes(x = BarcodeLength, y = log10(N))) + 
  geom_boxplot(outlier.shape = NA) + 
  geom_jitter(height = 0) +
  theme_bw(base_size = 15) + 
  stat_compare_means(comparisons = my_compa, method = "t.test")

L = 20

lapply(BarocdeN[, sort(unique(BarcodeLength))], function(i) {
  rbt2u <- BarocdeN[BarcodeLength == i][1:5, Barcode]
  r2g <- read2gene[Barcode %in% rbt2u]

  set.seed(9560)
  TrainingReads <- r2g[, .SD[sample(.N, 10000), ], Barcode]

  TestReads <- r2g[!read %in% TrainingReads$read, ]
  TestReads[, Class := factor(Barcode)]

  TrainingData <- SigTab[TrainingReads[, read], ]
  stopifnot(identical(row.names(TrainingData), TrainingReads$read))
  TrainingData$Class <- as.factor(TrainingReads$Barcode)
  TestData <- SigTab[TestReads[, read], ]
  stopifnot(identical(row.names(TestData), TestReads$read))
  TestData$Class <- as.factor(TestReads$Barcode)

  out <- paste0("./analysis/13.BarcodeLengthCompare/Version1/01.ModelingData/Barcode_", i, ".RData")
  save(TrainingReads, TestReads, TrainingData, TestData, file = out)
})
rbt2u <- BarocdeN[, .SD[which.max(N), ], BarcodeLength][, Barcode]
r2g <- read2gene[Barcode %in% rbt2u]

set.seed(9560)
TrainingReads <- r2g[, .SD[sample(.N, 10000), ], Barcode]

TestReads <- r2g[!read %in% TrainingReads$read, ]
TestReads[, Class := factor(Barcode)]

TrainingData <- SigTab[TrainingReads[, read], ]
stopifnot(identical(row.names(TrainingData), TrainingReads$read))
TrainingData$Class <- as.factor(TrainingReads$Barcode)
TestData <- SigTab[TestReads[, read], ]
stopifnot(identical(row.names(TestData), TestReads$read))
TestData$Class <- as.factor(TestReads$Barcode)

out <- paste0("./analysis/13.BarcodeLengthCompare/Version1/01.ModelingData/Barcode_Mix.RData")
save(TrainingReads, TestReads, TrainingData, TestData, file = out)
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

rf

for(i in gsub("Barcode_", "", gsub(".RData", "", list.files("./analysis/13.BarcodeLengthCompare/Version1/01.ModelingData")))) {
  load(paste0("./analysis/13.BarcodeLengthCompare/Version1/01.ModelingData/Barcode_", i, ".RData"))
  rm(list = c("TrainingReads", "TestReads", "TestData")); gc()
  set.seed(825)
  rf <- train(Class ~ ., 
              data = TrainingData,
              preProc = c("center", "scale", "YeoJohnson", "nzv"),
              method = "rf",
              trControl = fitControl,
              verbose = FALSE,
              tuneGrid = expand.grid(mtry = 35),
              # tuneLength = 10,
              metric = "Accuracy",
              allowParallel = TRUE)

  out <- paste0("./analysis/13.BarcodeLengthCompare/Version1/02.Models/Barcode_", i, ".Rds")
  saveRDS(rf, file = out)
  rm(list = c("TrainingData", "rf", "out")); gc()
}
stopCluster(cl)


tangchao7498/DecodeR documentation built on May 27, 2023, 7:21 p.m.